library(tidyverse)
library(plotly)
library(readxl)
library(lubridate)
knitr::opts_chunk$set(
fig.width = 10,
fig.asp = .2,
out.width = "90%",
dpi = 300
)
#read crime_df data
crime_df = read_csv("data/crime_df.csv")
head(crime_df)
## # A tibble: 6 × 18
## ...1 date hour_of…¹ borough level offense offen…² place suspe…³ suspe…⁴
## <dbl> <date> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 2017-12-22 0 MANHAT… FELO… GRAND … LARCEN… BAR/… UNKNOWN UNKNOWN
## 2 2 2017-12-14 9 QUEENS VIOL… HARRAS… HARASS… COMM… 25-44 BLACK
## 3 3 2017-02-01 8 STATEN… MISD… SEX CR… SEXUAL… RESI… UNKNOWN UNKNOWN
## 4 4 2017-12-29 15 QUEENS MISD… CRIMIN… CRIMIN… CHAI… <NA> <NA>
## 5 5 2017-03-01 2 MANHAT… VIOL… HARRAS… HARASS… RESI… UNKNOWN WHITE
## 6 6 2017-12-22 10 MANHAT… MISD… PETIT … LARCEN… STRE… UNKNOWN UNKNOWN
## # … with 8 more variables: suspect_sex <chr>, victim_age <chr>,
## # victim_race <chr>, victim_sex <chr>, day_of_week <chr>, mon <dbl>,
## # latitude <dbl>, longitude <dbl>, and abbreviated variable names
## # ¹hour_of_day, ²offense_classification, ³suspect_age, ⁴suspect_race
#define crime_df_map for our map
crime_df_map = drop_na(crime_df, latitude)
crime_plot =
crime_df %>%
filter(!is.na(borough)) %>%
plot_ly(
lat = ~latitude,
lon = ~longitude,
frame = ~hour_of_day,
type = "scattermapbox",
mode = "markers",
alpha = 0.2,
color = ~borough) %>%
layout(
mapbox = list(
style = 'carto-positron',
zoom = 9,
center = list(lon = -73.9, lat = 40.7)))
crime_plot
library(sp) #for analyzing spatial data
library(sf) #for analyzing spatial data
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(rgdal) #for importing zips shapefile and transform CRS
## Please note that rgdal will be retired during 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## See https://r-spatial.org/r/2022/04/12/evolution.html and https://github.com/r-spatial/evolution
## rgdal: version: 1.6-2, (SVN revision 1183)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.5.2, released 2022/09/02
## Path to GDAL shared files: C:/Users/30535/AppData/Local/R/win-library/4.2/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
## Path to PROJ shared files: C:/Users/30535/AppData/Local/R/win-library/4.2/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.5-1
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
library(leaflet) #for interactive maps
library(tigris) #geojoin
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(htmlwidgets) #save interactive maps
#import zips shapefile and transform CRS
zips = readOGR("data/cb_2015_us_zcta510_500k/cb_2015_us_zcta510_500k.shp")
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\30535\OneDrive\Documents\Columbia courses\8105\crimeweather_final_project.github.io\data\cb_2015_us_zcta510_500k\cb_2015_us_zcta510_500k.shp", layer: "cb_2015_us_zcta510_500k"
## with 33144 features
## It has 5 fields
## Integer64 fields read as strings: ALAND10 AWATER10
zips = spTransform(zips, CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
#extract only lon and lat
crime_lat_long = select(crime_df_map, longitude, latitude)
#transform coordinates into a SpatialPointsDataFrame
crime_spdf = SpatialPointsDataFrame(coords = crime_lat_long, data = crime_lat_long, proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
#subset only the zipcodes in which points are found
crime_zips = zips[crime_spdf, ]
#redefine crime_df_map to include zipcodes
crime_df_map = cbind(crime_df_map, over(crime_spdf, crime_zips[,"ZCTA5CE10"]))
crime_df_map = rename(crime_df_map, zip = ZCTA5CE10)
300 crime incidents can’t be matched to a zip code, so we drop them.
crime_df_map =
crime_df_map %>%
drop_na(zip)
In order to make meaningful data analysis, we want zip codes
represent regions with similar population. However, some zip code
regions are very small, among which there are even some buildings that
have their own zip codes. To deal with the challenges of zip codes, we
convert zip codes to modified zip codes to account for discrepancies in
population size. Modified zip codes represent regions with similar
population size. In order to make the conversion, we use a conversion
table ZCTA-to-MODZCTA.csv obtained from here.
zcta_conv = read_csv("data/ZCTA-to-MODZCTA.csv")
## Rows: 215 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): ZCTA, MODZCTA
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
zcta_conv$ZCTA = as.character(zcta_conv$ZCTA)
zcta_conv$MODZCTA = as.character(zcta_conv$MODZCTA)
# Convert zip to mod_zip
crime_df_map = crime_df_map %>%
left_join(rename(zcta_conv, zip = ZCTA), by = "zip") %>%
rename(mod_zip = MODZCTA)
0 crime incidents can’t be matched to a modified zip code and leave
NA values in the mod_zip column. This is because the 0
crime incidents happened in zip code 10550, which is not in New York
City. Therefore, we drop the 0 observations.
crime_df_map = drop_na(crime_df_map, mod_zip)
We can then obtain number of crimes in each zip region, sorting in descending order.
number_of_crimes =
crime_df_map %>%
group_by(mod_zip) %>%
summarize(number_of_crimes = n()) %>%
arrange(desc(number_of_crimes))
number_of_crimes
## # A tibble: 177 × 2
## mod_zip number_of_crimes
## <chr> <int>
## 1 11212 2125
## 2 11207 2114
## 3 11208 2044
## 4 10467 1911
## 5 10456 1899
## 6 10029 1806
## 7 10457 1757
## 8 11206 1673
## 9 10452 1650
## 10 10027 1631
## # … with 167 more rows
In order to obtain crime rate, we get population of each zip code region from here.
pop_zip = read_csv("data/csvData.csv")
## Rows: 1792 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): city, county
## dbl (2): zip, population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pop_zip$zip = as.character(pop_zip$zip)
#filter out regions in New York City
pop_zip = pop_zip %>%
filter(county %in% c("New York", "Kings", "Queens", "Bronx", "Richmond"))
The dataset pop_zip lacks population data in zip code
region 10065, so we manually add it in (data
source here).
pop_zip[nrow(pop_zip) + 1,] = list("10065","
New York City", "New York", 30339)
We then convert zip code to modified zip code, and get the population in each modified zip code region.
pop_mod_zip = pop_zip %>%
left_join(rename(zcta_conv, zip = ZCTA), by = "zip") %>%
rename(mod_zip = MODZCTA) %>%
group_by(mod_zip) %>%
summarize(population = sum(population))
Add population of each modified zip code region into
number_of_crimes, and calculate crime rate in each modified
zip code region. Crime rates are sorted in descending order.
crime_rate = left_join(number_of_crimes, pop_mod_zip, by = "mod_zip")
crime_rate$crime_rate = crime_rate$number_of_crimes / crime_rate$population * 100
crime_rate =
crime_rate %>%
select(mod_zip, crime_rate) %>%
group_by(crime_rate) %>%
arrange(desc(crime_rate))
as.tibble(crime_rate)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## # A tibble: 177 × 2
## mod_zip crime_rate
## <chr> <dbl>
## 1 10075 11.2
## 2 10018 9.00
## 3 10001 6.23
## 4 10004 6.19
## 5 10006 5.61
## 6 10036 4.96
## 7 10035 4.03
## 8 10455 3.46
## 9 10474 3.45
## 10 10013 3.25
## # … with 167 more rows
Read the modzcta shapefile obtained from here.
modzcta = st_read("data/MODZCTA_2010/MODZCTA_2010.shp") %>%
janitor::clean_names()
## Reading layer `MODZCTA_2010' from data source
## `C:\Users\30535\OneDrive\Documents\Columbia courses\8105\crimeweather_final_project.github.io\data\MODZCTA_2010\MODZCTA_2010.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 178 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913176 ymin: 120122 xmax: 1067382 ymax: 272844
## Projected CRS: Lambert_Conformal_Conic
Merge crime rate data with modzcta shapefile.
crime_rate_modzcta = geo_join(modzcta, crime_rate, "modzcta", "mod_zip", how = "inner")
## Warning: We recommend using the dplyr::*_join() family of functions instead.
Make interactive map of crime rate.
#label
labels = sprintf(
"<strong>%s</strong><br/>Crime rate is %g%", crime_rate_modzcta$modzcta, crime_rate_modzcta$crime_rate) %>%
lapply(htmltools::HTML)
#color palette
pal = colorBin(palette = "OrRd", 9, domain = crime_rate_modzcta$crime_rate)
map_interactive = crime_rate_modzcta %>%
st_transform(crs = "+init=epsg:4326") %>%
leaflet() %>%
addProviderTiles(provider = "CartoDB.Positron") %>%
addPolygons(label = labels,
stroke = FALSE,
smoothFactor = 0.5,
opacity = 1,
fillOpacity = 0.7,
fillColor = ~ pal(crime_rate),
highlightOptions = highlightOptions(weight = 5,
fillOpacity = 1,
color = "black",
opacity = 1,
bringToFront = TRUE)) %>%
addLegend("bottomright",
pal = pal,
values = ~ crime_rate,
title = "Crime cases per 100 residents",
opacity = 0.7)
## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette OrRd is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette OrRd is 9
## Returning the palette you asked for with that many colors
saveWidget(map_interactive, "nyc_crime_rate_map.html")
map_interactive
crime_month = crime_df_map %>%
group_by(mod_zip, date) %>%
summarize(crime_counts = n())
## `summarise()` has grouped output by 'mod_zip'. You can override using the
## `.groups` argument.
crime_month$date = cut(crime_month$date, "30 days")
crime_month = crime_month %>%
group_by(mod_zip, date) %>%
summarize(crime_counts = sum(crime_counts)) %>%
rename(month_following = date)
## `summarise()` has grouped output by 'mod_zip'. You can override using the
## `.groups` argument.
crime_month = left_join(crime_month, pop_mod_zip, by = "mod_zip") %>%
select(mod_zip, population, month_following, crime_counts)
crime_month$crime_rate = crime_month$crime_counts / crime_month$population
crime_month = crime_month %>%
select(month_following, mod_zip, population, crime_counts, crime_rate) %>%
arrange(month_following)
crime_month = geo_join(modzcta, crime_month, "modzcta", "mod_zip", how = "inner")
## Warning: We recommend using the dplyr::*_join() family of functions instead.
library(shiny)
#define UI
ui = fluidPage(
titlePanel("NYC crime visualization by modified zip code regions"),
sidebarLayout(
h5("blabla"),
sidebarPanel(
selectInput("date",
"Select a date (month starting from):",
choices = unique(crime_month$month_following))
)
),
mainPanel(
tabsetPanel(
tabPanel("Crime counts", leafletOutput("crime_counts")),
tabPanel("Crime rate", leafletOutput("crime_rate"))
)
)
)
#define server logic
server = function(input, output){
month_selected = reactive({
w = crime_month %>%
filter(month_following == input$date)
return(w)
})
output$crime_counts = renderLeaflet({
pal = colorBin(palette = "YlGn", 9, domain = crime_month$crime_counts)
labels = sprintf(
"<strong>%s</strong><br/>%g incidents in the month",
month_selected()$mod_zip, month_selected()$crime_counts) %>%
lapply(htmltools::HTML)
month_selected() %>%
st_transform(crs = "+init=epsg:4326") %>%
leaflet() %>%
addProviderTiles(provider = "CartoDB.Positron") %>%
setView(-73.9, 40.7, zoom = 10) %>%
addPolygons(label = labels,
stroke = FALSE,
smoothFactor = 0.5,
opacity = 1,
fillOpacity = 0.7,
fillColor = ~ pal(month_selected()$crime_counts),
highlightOptions = highlightOptions(weight = 5,
fillOpacity = 1,
color = "black",
opacity = 1,
bringToFront = TRUE)) %>%
addLegend("bottomright",
pal = pal,
values = ~ crime_counts,
title = "Crime incident counts",
opacity = 0.7)
})
output$crime_rate = renderLeaflet({
pal = colorBin(palette = "OrRd", 9, domain = crime_month$crime_rate)
labels = sprintf(
"<strong>%s</strong><br/>%g incidents in the month",
month_selected()$mod_zip, month_selected()$crime_rate) %>%
lapply(htmltools::HTML)
month_selected() %>%
st_transform(crs = "+init=epsg:4326") %>%
leaflet() %>%
addProviderTiles(provider = "CartoDB.Positron") %>%
setView(-73.9, 40.7, zoom = 10) %>%
addPolygons(label = labels,
stroke = FALSE,
smoothFactor = 0.5,
opacity = 1,
fillOpacity = 0.7,
fillColor = ~ pal(month_selected()$crime_rate),
highlightOptions = highlightOptions(weight = 5,
fillOpacity = 1,
color = "black",
opacity = 1,
bringToFront = TRUE)) %>%
addLegend("bottomright",
pal = pal,
values = ~ crime_rate,
title = "Crime incident rate",
opacity = 0.7)
})
}
Finally, run the shiny app!
Bug……
#shinyApp(ui = ui, server = server)